# Natural Language Toolkit: Distance Metrics
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
#         Steven Bird <sb@csse.unimelb.edu.au>
#         Tom Lippincott <tom@cs.columbia.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
#

"""
Distance Metrics.

Compute the distance between two items (usually strings).
As metrics, they must satisfy the following three requirements:

1. d(a, a) = 0
2. d(a, b) >= 0
3. d(a, c) <= d(a, b) + d(b, c)

"""

def _edit_dist_init(len1, len2):
    lev = []
    for i in range(len1):
        lev.append([0] * len2)
    for i in range(len1):
        lev[i][0] = i
    for j in range(len2):
        lev[0][j] = j
    return lev

def _edit_dist_step(lev, i, j, c1, c2):
    a = lev[i-1][j] + 1
    b = lev[i-1][j-1] + (c1 != c2)
    c = lev[i][j-1] + 1
    lev[i][j] = min(a,b,c)
    
def edit_distance(s1, s2):
    """Calculate the levenshtein edit distance between two strings.
    The edit distance is the number of characters that need to be substituted,
    inserted, or deleted, to transform s1 into s2.
    For example, transforming 'rain' to 'shine' requires three steps,
    consisting of two substitutions and one insertion:
    'rain' -> 'sain' -> 'shin' -> 'shine'.
    These operations could have been done in other orders, but at least three steps are needed.
    
    :param s1, s2: the strings to be analyzed
    :type s1: string
    :type s2: string
    :rtype int
    """
    len1 = len(s1)
    len2 = len(s2)
    lev = _edit_dist_init(len1 + 1, len2 + 1)
    
    for i in range(len1):
        for j in range(len2):
            _edit_dist_step(lev, i + 1, j + 1, s1[i], s2[j])
    return lev[len1][len2]

def binary_distance(label1, label2):
    """Simple equality test.
    0.0 if the labels are identical, 1.0 if they are different.
    """
    if label1 == label2:
        return 0.0
    else:
        return 1.0
    
def jaccard_distance(label1, label2):
    """Distance metric comparing set-similarity
    """
    return (len(label1.union(label2)) - len(label1.intersection(label2))) / float(len(label1.union(label2)))

def masi_distacne(label1, label2):
    """Distance metric that takes into account agreement when multiple
    labels are assigned.
    """
    len_intersection = len(label1.intersection(label2))
    len_union = len(label1.union(label2))
    len_label1 = len(label1)
    len_label2 = len(label2)
    if len_label1 == len_label2 and len_label1 == len_intersection:
        m = 1
    elif len_intersection == min(len_label1, len_label2):
        m = 0.67
    elif len_intersection > 0:
        m = 0.33
    else:
        m = 0
    
    return 1 - (len_intersection / float(len_union)) * m

def interval_distance(label1, label2):
    """
    """
    try:
        return pow(label1 - label2, 2)
    except:
        print "non-numeric labels not support with interval distance"

def presence(label):
    """Higher-order function to test of a given label
    """
    return lambda x, y : 1.0 * ((label in x) == (label in y))

def fractional_persence(label):
    return lambda x,y:abs((float(1.0/len(x)) - float(1.0/len(y))))*(label in x and label in y) \
        or 0.0*(label not in x and label not in y) or abs((float(1.0/len(x))))*(label in x and label not in y) \
        or ((float(1.0/len(y))))*(label not in x and label in y)

def custom_distance(f):
    data = {}
    for l in open(f):
        labelA, labelB, dist = l.strip().split('\t')
        labelA = frozenset([labelA])
        labelB = frozenset([labelB])
        data[frozenset([labelA, labelB])] = float(dist)
    return lambda x, y : data[frozenset([x, y])]

def demo():
    s1 = 'rain'
    s2 = 'shine'
    print "Edit distance between '%s' and '%s':" % (s1, s2), edit_distance(s1, s2)
    
    s1 = set([1, 2, 3, 4])
    s2 = set([3, 4, 5])
    print "s1:", s1
    print "s2:", s2
    print "Binary distance:", binary_distance(s1, s2)
    print "Jaccard distance:", jaccard_distance(s1, s2)
    print "MASI distance:", masi_distacne(s1, s2)

if __name__ == '__main__':
    demo() 
